Q Learning

$Q_{s,a} \leftarrow (1-\alpha)\, Q_{s,a} + \alpha \left( R_{s,a} + \gamma \max_{a'} Q_{s',a'} \right)$
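
The same update written as code, as a minimal sketch (the helper name `q_update` is for illustration only; it assumes `Q` is an |S| x |A| NumPy array and that `alpha` and `gamma` are the learning rate and discount factor used in the cells below):

In [ ]:
import numpy as np

def q_update(Q, s, a, r, s_next, alpha, gamma):
    # Blend the old estimate with the bootstrapped target r + gamma * max_a' Q(s', a').
    Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (r + gamma * np.max(Q[s_next, :]))
    return Q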


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import gym
import numpy as np
from gym.envs.registration import register
from gym import wrappers
import shutil

In [2]:
# Optional: register a deterministic (non-slippery) 4x4 FrozenLake variant
# and use it instead of the default stochastic environment.
# register(
#     id='FrozenLakeNotSlippery-v0',
#     entry_point='gym.envs.toy_text:FrozenLakeEnv',
#     kwargs={'map_name' : '4x4', 'is_slippery': False},
#     max_episode_steps=100,
#     reward_threshold=0.78, # optimum = .8196
# )

# env = gym.make('FrozenLakeNotSlippery-v0')

In [5]:
env = gym.make('FrozenLake-v0')

In [6]:
Q = np.zeros((env.observation_space.n, env.action_space.n))  # 16 states x 4 actions
alpha = 0.1    # learning rate
beta = 0.001   # exploration probability (epsilon-greedy)
gamma = 0.95   # discount factor
num_episodes = 1000
reward_list = []
for i in range(num_episodes):
    s = env.reset()
    done = False
    while not done:
        # epsilon-greedy action selection
        if np.random.rand() < beta:
            a = np.random.randint(env.action_space.n)
        else:
            a = np.argmax(Q[s, :])
        s_next, reward, done, info = env.step(a)
        if done:  # episode ended: goal reached (+1) or fell into a hole / ran out of steps (-1)
            r = 1.0 if reward > 0.0 else -1.0
        else:     # non-terminal step
            r = 0.0
        Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (r + gamma * np.max(Q[s_next, :]))
        s = s_next
    reward_list.append(reward)

# running sum of the environment reward over a sliding 100-episode window
plt.plot(np.convolve(np.ones(100), reward_list, "valid"))


Out[6]:
[<matplotlib.lines.Line2D at 0x11bca4c50>]
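
The curve above is a 100-episode moving sum of the terminal rewards; dividing by the window length turns it into a moving success rate, which is easier to read. A small sketch, reusing `reward_list` from the training cell:

In [ ]:
# Moving success rate over a sliding 100-episode window (reuses reward_list from above).
window = 100
plt.plot(np.convolve(reward_list, np.ones(window) / window, "valid"))
plt.xlabel("episode")
plt.ylabel("success rate (last %d episodes)" % window)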

In [10]:
print("Final Q-Table Values")
print(Q)


Final Q-Table Values
[[  8.49277351e-02  -1.04291755e-01  -1.21457722e-01  -1.44070851e-01]
 [ -2.28147314e-01  -2.80513399e-01  -2.42035100e-01   8.72715986e-04]
 [ -2.11063200e-01  -2.09781582e-01  -2.10260619e-01  -1.56850154e-02]
 [ -2.52386723e-01  -2.11843499e-01  -2.62172823e-01  -1.79405373e-02]
 [  1.14185937e-01  -1.50347177e-01  -1.57110559e-01  -2.53992360e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [ -5.84954882e-01  -6.07710282e-01  -6.05389046e-01  -6.14815208e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [ -1.54010155e-01  -1.00000000e-01  -1.60026635e-01   1.50314053e-01]
 [ -1.00000000e-01   1.81920304e-01  -1.00000000e-01  -1.01083697e-01]
 [  9.91975740e-02  -2.32813265e-01  -2.16187282e-01  -2.28845393e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [ -1.00000000e-01  -1.90000000e-01   3.75455641e-01  -1.00000000e-01]
 [ -1.09029473e-03  -1.29651072e-05  -1.41376238e-03   5.22837701e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]]
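
A compact way to read the table is to print the greedy action for each state on the 4x4 grid. A sketch, assuming FrozenLake's action encoding 0=Left, 1=Down, 2=Right, 3=Up:

In [ ]:
# Greedy policy implied by Q, one arrow per state.
arrows = np.array(["<", "v", ">", "^"])  # 0=Left, 1=Down, 2=Right, 3=Up
print(arrows[np.argmax(Q, axis=1)].reshape(4, 4))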

In [12]:
# Roll out the greedy policy once and render each step.
s = env.reset()
done = False
n = 0
while not done:
    n += 1
    a = np.argmax(Q[s, :])          # greedy action
    s, r, done, info = env.step(a)
    # print("%s %s %s %s" % (s, r, done, info))
    env.render()
print(n)


  (Left)
SFFF
FHFH
FFFH
HFFG
[... 30 rendered frames in total, one per step; every frame shows the same 4x4 map because
the agent-position highlight is not preserved in this text export. Actions taken:
Left x15, Up x3, Left x5, Up x2, Down, Right x3, Up ...]
30
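
A single rollout on the slippery map is noisy, so the step count above says little about policy quality. A sketch for estimating the greedy policy's success rate over many episodes, reusing `env` and `Q` from above:

In [ ]:
# Estimate the greedy policy's success rate (reuses env and Q from the cells above).
n_eval = 1000
successes = 0.0
for _ in range(n_eval):
    s = env.reset()
    done = False
    while not done:
        s, r, done, _ = env.step(np.argmax(Q[s, :]))
    successes += r  # terminal reward is 1.0 only when the goal is reached
print("success rate: %.3f" % (successes / n_eval))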

In [9]:
env.close()
#gym.upload('/tmp/FrozenLake_01', api_key='<YOUR_API_KEY>')

In [ ]: